# -*- coding: utf-8 -*-
# time: 2025/4/28 14:09
# file: 文本推理.py
# author: hanson

"""
文本推理
pip install ms-swift

"""

import os
# Expose only GPU 0 to this process. The variable is assigned before the
# swift import that follows -- presumably so it is already in place when the
# CUDA runtime is initialised; keep this ordering (TODO confirm).
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import PtEngine, RequestConfig, InferRequest
# Local, machine-specific directory of the model checkpoint (a Qwen2.5
# 0.5B Instruct model, judging by the directory name).
model = r"E:\soft\model\qwen\Qwen\Qwen2___5-0___5B-Instruct"

# Load the inference engine.
engine = PtEngine(model, device_map='auto')
# Shared generation settings for every request in the batch.
request_config = RequestConfig(max_tokens=512, temperature=0)

# Two requests are submitted together to demonstrate batch inference:
# a single-turn question and a multi-turn conversation whose last message
# is the user turn to be answered.
infer_requests = [
    InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}]),
    InferRequest(messages=[
        {'role': 'user', 'content': '浙江的省会在哪？'},
        {'role': 'assistant', 'content': '浙江省的省会是杭州。'},
        {'role': 'user', 'content': '这里有什么好玩的地方'},
    ]),
]
resp_list = engine.infer(infer_requests, request_config)

# Print each response by its position instead of hard-coding indices, so the
# output stays in sync when requests are added to or removed from the batch.
for idx, resp in enumerate(resp_list):
    print(f'response{idx}: {resp.choices[0].message.content}')